#coding:utf8

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType

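# Read a parquet file to create a DataFrame.
# Parquet is a columnar storage file format commonly used with Spark. Its advantages:
# 1. columnar storage; 2. built-in schema; 3. serialized storage with a small footprint
# (the file cannot be opened directly to view its data).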
if __name__ == '__main__':
    # Build the SparkSession, the entry point object for the Spark execution
    # environment. getOrCreate() reuses an already-active session if one exists.
    spark = (
        SparkSession.builder
        .appName("test_parque")
        .master("local[*]")
        .getOrCreate()
    )

    try:
        # Load the parquet file into a DataFrame. No explicit schema is passed
        # to the reader here. The path is relative, so the script has to be
        # launched from a directory where it resolves.
        df = spark.read.format("parquet").load("../data/input/sql/users.parquet")

        # Print the DataFrame's schema, then a preview of its rows.
        df.printSchema()
        df.show()
    finally:
        # Always shut the session down so its resources are released even if
        # loading or displaying the data raises.
        spark.stop()
